// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/tools/tld_cleanup/tld_cleanup_util.h"

#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "url/gurl.h"
#include "url/third_party/mozilla/url_parse.h"

namespace {

// Marker comment lines recognized in the input data. While parsing, rules that
// appear after the BEGIN marker and before the END marker are flagged as
// private.
const char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
const char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Values written to the "type" column of each generated rule line. A rule is
// emitted as either an exception or a wildcard (or neither); kPrivateRule is
// added on top of that value for private rules.
const int kExceptionRule = 1;
const int kWildcardRule = 2;
const int kPrivateRule = 4;
}  // namespace

namespace net {
namespace tld_cleanup {

    // Serializes every entry of 'rules' and writes the result to 'outfile'.
    // The output starts with a fixed "%{ ... %}" prologue and a DomainRule
    // struct declaration, followed by a "%%"-delimited section holding one
    // "<name>, <type>" line per rule, each terminated by a LF.  <type> is
    // kExceptionRule or kWildcardRule (exception wins if both are set), or 0,
    // plus kPrivateRule when the rule is private.
    // Returns true only if the number of bytes reported written equals the size
    // of the serialized data.
    bool WriteRules(const RuleMap& rules, const base::FilePath& outfile)
    {
        std::string contents("%{\n"
                             "// Copyright 2012 The Chromium Authors. All rights reserved.\n"
                             "// Use of this source code is governed by a BSD-style license "
                             "that can be\n"
                             "// found in the LICENSE file.\n\n"
                             "// This file is generated by net/tools/tld_cleanup/.\n"
                             "// DO NOT MANUALLY EDIT!\n"
                             "%}\n"
                             "struct DomainRule {\n"
                             "  int name_offset;\n"
                             "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
                             "};\n"
                             "%%\n");

        RuleMap::const_iterator entry = rules.begin();
        while (entry != rules.end()) {
            // Exception and wildcard are mutually exclusive in the output; the
            // private flag is layered on top of whichever one applies.
            int flags = 0;
            if (entry->second.exception)
                flags = kExceptionRule;
            else if (entry->second.wildcard)
                flags = kWildcardRule;
            if (entry->second.is_private)
                flags += kPrivateRule;

            contents += entry->first;
            contents += ", ";
            contents += base::IntToString(flags);
            contents += "\n";
            ++entry;
        }

        contents += "%%\n";

        const int expected_bytes = static_cast<int>(contents.size());
        const int written_bytes = base::WriteFile(outfile, contents.data(), expected_bytes);
        return written_bytes == expected_bytes;
    }

    // Adjusts the rule to a standard form: removes single extraneous dots and
    // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
    // valid; logs a warning and returns kWarning if it is probably invalid; and
    // logs an error and returns kError if the rule is (almost) certainly invalid.
    //
    // 'domain' is rewritten in place.  On the "Ignoring empty rule" paths it is
    // left empty; on the kError path it holds the rule with its dots and
    // '!' / '*.' prefix already stripped.
    // 'rule->exception' or 'rule->wildcard' is set to true when the matching
    // prefix is found; neither field is ever cleared here, so the caller must
    // initialize them.
    NormalizeResult NormalizeRule(std::string* domain, Rule* rule)
    {
        NormalizeResult result = kSuccess;

        // Reject an empty rule before looking at any character: at(0) on an
        // empty string is an out-of-range access.
        if (domain->empty()) {
            LOG(WARNING) << "Ignoring empty rule";
            return kWarning;
        }

        // Strip single leading and trailing dots.
        if (domain->at(0) == '.')
            domain->erase(0, 1);
        if (domain->empty()) {
            LOG(WARNING) << "Ignoring empty rule";
            return kWarning;
        }
        if (domain->at(domain->size() - 1) == '.')
            domain->erase(domain->size() - 1, 1);
        if (domain->empty()) {
            LOG(WARNING) << "Ignoring empty rule";
            return kWarning;
        }

        // Allow single leading '*.' or '!', saved here so it's not canonicalized.
        if (domain->at(0) == '!') {
            domain->erase(0, 1);
            rule->exception = true;
        } else if (domain->compare(0, 2, "*.") == 0) {
            domain->erase(0, 2);
            rule->wildcard = true;
        }
        if (domain->empty()) {
            LOG(WARNING) << "Ignoring empty rule";
            return kWarning;
        }

        // Warn about additional '*.' or '!'.  The permitted leading marker has
        // already been erased, so any remaining occurrence is suspicious.
        if (domain->find("*.") != std::string::npos || domain->find('!') != std::string::npos) {
            LOG(WARNING) << "Keeping probably invalid rule: " << *domain;
            result = kWarning;
        }

        // Make a GURL and normalize it, then get the host back out.
        std::string url = "http://";
        url.append(*domain);
        GURL gurl(url);
        const std::string& spec = gurl.possibly_invalid_spec();
        url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
        if (host.len < 0) {
            LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << *domain;
            return kError;
        }
        if (!gurl.is_valid()) {
            LOG(WARNING) << "Keeping rule that GURL says is invalid: " << *domain;
            result = kWarning;
        }
        domain->assign(spec.substr(host.begin, host.len));

        return result;
    }

    // Parses 'data' line by line and fills 'rules' (which must be non-null)
    // with the normalized rules found in it.
    //
    //  - Lines starting with kBeginPrivateDomainsComment or
    //    kEndPrivateDomainsComment turn the "private" flag on or off for the
    //    rules that follow.
    //  - Other lines starting with "//" are skipped as comments.
    //  - Every remaining non-empty line is truncated at its first whitespace
    //    character and passed to NormalizeRule().  Rules for which it returns
    //    kError, and rules that normalize to an empty string, are dropped.
    //  - Finding the same normalized domain twice is a CHECK failure.
    //  - For every kept rule containing a '.', the final label is remembered
    //    and, once all of 'data' has been read, added as a plain rule unless an
    //    explicit rule with that name exists.
    //
    // Returns the largest NormalizeResult (per std::max on the enum values)
    // reported by NormalizeRule() for any line, or kSuccess if there was none.
    NormalizeResult NormalizeDataToRuleMap(const std::string data,
        RuleMap* rules)
    {
        CHECK(rules);
        // We do a lot of string assignment during parsing, but simplicity is more
        // important than performance here.
        std::string domain;
        NormalizeResult result = kSuccess;
        size_t line_start = 0;
        size_t line_end = 0;
        bool is_private = false;
        RuleMap extra_rules;
        // Marker lengths without the terminating NUL.  Kept as size_t so they
        // combine with string offsets without signed/unsigned conversions.
        const size_t begin_private_length = arraysize(kBeginPrivateDomainsComment) - 1;
        const size_t end_private_length = arraysize(kEndPrivateDomainsComment) - 1;
        while (line_start < data.size()) {
            if (line_start + begin_private_length < data.size() && !data.compare(line_start, begin_private_length, kBeginPrivateDomainsComment)) {
                is_private = true;
                line_end = line_start + begin_private_length;
            } else if (line_start + end_private_length < data.size() && !data.compare(line_start, end_private_length, kEndPrivateDomainsComment)) {
                is_private = false;
                line_end = line_start + end_private_length;
            } else if (line_start + 1 < data.size() && data[line_start] == '/' && data[line_start + 1] == '/') {
                // Skip comments.
                line_end = data.find_first_of("\r\n", line_start);
                if (line_end == std::string::npos)
                    line_end = data.size();
            } else {
                // Truncate at first whitespace.
                line_end = data.find_first_of("\r\n \t", line_start);
                if (line_end == std::string::npos)
                    line_end = data.size();
                // Copy the substring straight out of 'data'.  Passing
                // data.data() here would build a temporary std::string holding
                // the whole buffer for every single rule line.
                domain.assign(data, line_start, line_end - line_start);

                Rule rule;
                rule.wildcard = false;
                rule.exception = false;
                rule.is_private = is_private;
                NormalizeResult new_result = NormalizeRule(&domain, &rule);
                // NormalizeRule() reports an "ignored" empty rule as a warning
                // and leaves 'domain' empty; such a rule must not be stored
                // under an empty key.
                if (new_result != kError && !domain.empty()) {
                    // Check the existing rules to make sure we don't have an exception and
                    // wildcard for the same rule, or that the same domain is listed as both
                    // private and not private. If we did, we'd have to update our
                    // parsing code to handle this case.
                    CHECK(rules->find(domain) == rules->end())
                        << "Duplicate rule found for " << domain;

                    (*rules)[domain] = rule;
                    // Add true TLD for multi-level rules.  We don't add them right now, in
                    // case there's an exception or wild card that either exists or might be
                    // added in a later iteration.  In those cases, there's no need to add
                    // it and it would just slow down parsing the data.
                    size_t tld_start = domain.find_last_of('.');
                    if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
                        std::string extra_rule_domain = domain.substr(tld_start + 1);
                        RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
                        Rule extra_rule;
                        extra_rule.exception = false;
                        extra_rule.wildcard = false;
                        if (iter == extra_rules.end()) {
                            extra_rule.is_private = is_private;
                        } else {
                            // A rule already exists, so we ensure that if any of the entries is
                            // not private the result should be that the entry is not private.
                            // An example is .au which is not listed as a real TLD, but only
                            // lists second-level domains such as com.au. Subdomains of .au
                            // (eg. blogspot.com.au) are also listed in the private section,
                            // which is processed later, so this ensures that the real TLD
                            // (eg. .au) is listed as public.
                            extra_rule.is_private = is_private && iter->second.is_private;
                        }
                        extra_rules[extra_rule_domain] = extra_rule;
                    }
                }
                result = std::max(result, new_result);
            }

            // Find beginning of next non-empty line.
            line_start = data.find_first_of("\r\n", line_end);
            if (line_start == std::string::npos)
                line_start = data.size();
            line_start = data.find_first_not_of("\r\n", line_start);
            if (line_start == std::string::npos)
                line_start = data.size();
        }

        // Add the implied TLD rules, never overriding an explicit rule.
        for (RuleMap::const_iterator iter = extra_rules.begin();
             iter != extra_rules.end();
             ++iter) {
            if (rules->find(iter->first) == rules->end()) {
                (*rules)[iter->first] = iter->second;
            }
        }

        return result;
    }

    // Reads the rules in 'in_filename', normalizes them with
    // NormalizeDataToRuleMap() and writes the result to 'out_filename' with
    // WriteRules().
    // Returns the normalization result, or kError if the output could not be
    // written.  A failure to read the input is logged and reported as kSuccess.
    NormalizeResult NormalizeFile(const base::FilePath& in_filename,
        const base::FilePath& out_filename)
    {
        std::string contents;
        if (!base::ReadFileToString(in_filename, &contents)) {
            LOG(ERROR) << "Unable to read file";
            // We return success since we've already reported the error.
            return kSuccess;
        }

        RuleMap parsed_rules;
        const NormalizeResult parse_result = NormalizeDataToRuleMap(contents, &parsed_rules);

        if (WriteRules(parsed_rules, out_filename))
            return parse_result;

        LOG(ERROR) << "Error(s) writing output file";
        return kError;
    }

} // namespace tld_cleanup
} // namespace net
